Source Code of org.terrier.indexing.TaggedDocument

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is TaggedDocument.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
 *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>  
 */
package org.terrier.indexing;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.Stack;


import org.apache.log4j.Logger;
import org.terrier.indexing.tokenisation.EnglishTokeniser;
import org.terrier.indexing.tokenisation.TokenStream;
import org.terrier.indexing.tokenisation.Tokeniser;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.TagSet;


/**
 * Models a tagged document (e.g., an HTML or TREC document). In particular,
 * {@link #getNextTerm() getNextTerm()} returns the next token in the current
 * chunk of text, according to the specified tokeniser. This class replaces
 * {@link HTMLDocument} and {@link TRECDocument}.
 * 
 * This class uses the following properties:
 * <ul>
 * <li><tt>tokeniser</tt>, the tokeniser class to be used (defaults to EnglishTokeniser);</li>
 * <li><tt>max.term.length</tt>, the maximum length in characters of a term (defaults to 20);</li>
 * <li><tt>lowercase</tt>, whether characters are transformed to lowercase (defaults to <tt>true</tt>).</li>
 * <li><tt>TaggedDocument.abstracts</tt> - names of the abstracts to be saved for query-biased summarisation. 
 * Defaults to empty. Example: <tt>TaggedDocument.abstracts=title,abstract</tt></li>
 * <li><tt>TaggedDocument.abstracts.tags</tt> - names of the tags to save text from for the purposes of 
 * query-biased summarisation. Example: <tt>TaggedDocument.abstracts.tags=title,body</tt>. ELSE is a special
 * tag name, which means anything not consumed by the other tags.</li>
 * <li><tt>TaggedDocument.abstracts.lengths</tt> - maximum lengths of the abstracts. Defaults to empty. 
 * Example: <tt>TaggedDocument.abstracts.lengths=100,2048</tt></li>
 * <li><tt>TaggedDocument.abstracts.tags.casesensitive</tt> - should the names of tags be case-sensitive? Defaults to false.</li>
 * </ul>
 * @author Craig Macdonald, Vassilis Plachouras, Richard McCreadie, Rodrygo Santos
 * @since 3.5
 */
public class TaggedDocument implements Document {
  /** Logger shared by all instances of this class. */
  protected static final Logger logger = Logger.getLogger(TaggedDocument.class);
  /** The maximum length of a token in the check method. */
  protected final static int tokenMaximumLength = ApplicationSetup.MAX_TERM_LENGTH;
  
  /** Change to lowercase? Read from the <tt>lowercase</tt> property, defaulting to <tt>true</tt>. */
  protected final static boolean lowercase = Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true"));
  
  /** A temporary String array. NOTE(review): not referenced by any code in this class; possibly kept for subclasses - confirm. */
  protected final String[] stringArray = new String[1];
  
  /** The input reader the document content is read from. */
  protected Reader br;
  
  /** End of Document. Set by processEndOfDocument(), which getNextTerm() calls once the reader is exhausted. */
  protected boolean EOD = false;
  
  /**
   * Incremented by getNextTerm() alongside its reads from the reader. As the input is a
   * Reader, this tracks characters rather than bytes, and is only approximate because a few
   * code paths increment it without a matching read.
   */
  protected long counter = 0;  
  
  /**
   * Saves the last read character between consecutive calls of getNextTerm().
   * Only a saved '&lt;' or '&amp;' is picked up again by the next call; -1 means nothing is pending.
   */
  protected int lastChar = -1;
  
  /**
   * Indicates whether an error has occurred. Reset at the start of every parsing iteration of
   * getNextTerm() and set when a '&gt;' is met while skipping whitespace. It is not read by any
   * code in this class.
   */
  protected boolean error;
  
  /**  The tags to process or skip. Built from TagSet.TREC_DOC_TAGS by the constructors. */
  protected TagSet _tags = null; 
  
  /** 
   * The tags to process exactly. For these tags,
   * the text is returned by getNextTerm() without being tokenised
   * (only lowercased, if enabled). Built from TagSet.TREC_EXACT_DOC_TAGS.
   */
  protected TagSet _exact = null; 
  
  /** The tags to consider as fields. Built from TagSet.FIELD_TAGS by the constructors. */
  protected TagSet _fields = null; 
  
  /** The stack where the (uppercased) names of the open tags are pushed and popped accordingly. */
  protected Stack<String> stk = new Stack<String>();
  
  /** Indicates whether we are in a tag to process. */
  protected boolean inTagToProcess = false;
  /** Indicates whether we are in a tag to skip. */
  protected boolean inTagToSkip = false;
  
  /**
   * The hash set where the (uppercased) names of the currently open tags that are
   * considered as fields are inserted. This is the set returned by getFields().
   */
  protected Set<String> htmlStk = new HashSet<String>();
  /**
   * Specifies whether the tokeniser is in a field tag to process.
   * Set to true when a field tag is opened; no code in this class sets it back to false.
   */
  protected boolean inHtmlTagToProcess = false;


  /** The properties of this document, as supplied to the constructor. Keys are accessed in lowercase by getProperty()/setProperty(). */
  protected Map<String, String> properties = null;


  /** The tokeniser applied to each chunk of text extracted from the document. */
  protected Tokeniser tokeniser;
  /** The tokens of the most recently extracted chunk of text that have not been returned yet. */
  protected TokenStream currentTokenStream;
  
  /** The names of the abstracts to be saved (comma separated list), lowercased. Each is used as the name of the document property the abstract is stored under. **/
  protected final String[] abstractnames = ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("TaggedDocument.abstracts", "").toLowerCase());
  /** The tags that the named abstracts come from (comma separated list); indexed in parallel with abstractnames. **/
  protected final String[] abstracttags = ArrayUtils.parseCommaDelimitedString(ApplicationSetup.getProperty("TaggedDocument.abstracts.tags", ""));
  /** The maximum length of each named abstract (comma separated list); indexed in parallel with abstractnames. **/
  protected final int[] abstractlengths = ArrayUtils.parseDelimitedInts(ApplicationSetup.getProperty("TaggedDocument.abstracts.lengths", ""), ",");
  /** Whether tag names are matched case-sensitively against abstracttags (defaults to false). */
  protected final boolean abstractTagsCaseSensitive = Boolean.parseBoolean(ApplicationSetup.getProperty("TaggedDocument.abstracts.tags.casesensitive", "false"));
  /** number of abstract types */
  protected final int abstractCount = abstractnames.length;
  /** builders for each abstract; the array is allocated here, but its elements must be created by the constructor */
  protected final StringBuilder[] abstracts = new StringBuilder[abstractCount];
  /** index of the abstract whose tag is the special name ELSE, or -1 if there is none **/
  protected int elseAbstractSpecialTag = -1;
  
  /**
   * Constructs an instance of the class from the given input stream.
   * @param docStream
   * @param docProperties
   * @param _tokeniser
   */
  public TaggedDocument(InputStream docStream, Map<String, String> docProperties, Tokeniser _tokeniser)
  {
    String charset = docProperties.get("charset");
    try{
      // Java's decoding of InputStream bytes into characters is
      // inefficient for one character at a time.
      // So we wrap it in BufferedReader which decodes bunches of
      // characters each time.      
      this.br = new BufferedReader(charset != null
        ? new InputStreamReader(docStream, charset)
        : new InputStreamReader(docStream));          
    } catch (UnsupportedEncodingException uee) {
      //logger.warn("Desired encoding ("+charset+") unsupported. Resorting to platform default.", uee);
      this.br = new BufferedReader(new InputStreamReader(docStream));
    }
    this.properties = docProperties;  
    this._tags = new TagSet(TagSet.TREC_DOC_TAGS);
    this._exact = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);
    this._fields = new TagSet(TagSet.FIELD_TAGS);
    this.tokeniser = _tokeniser;
    this.currentTokenStream = Tokeniser.EMPTY_STREAM;
    for(int i=0;i<abstractCount;i++)
    {
      abstracts[i] = new StringBuilder(abstractlengths[i]);
      if (! abstractTagsCaseSensitive)
         abstracttags[i] = abstracttags[i].toUpperCase();
      if (abstracttags[i].toUpperCase().equals("ELSE"))
        elseAbstractSpecialTag = i;
    }
  }
  
  /** 
   * Constructs an instance of the class from the given reader object.
   * @param docReader Reader the stream from the collection that ends at the 
   *    end of the current document.
   */
  public TaggedDocument(Reader docReader, Map<String, String> docProperties, Tokeniser _tokeniser)
  {
    this.br = docReader;
    properties = docProperties;  
    this._tags = new TagSet(TagSet.TREC_DOC_TAGS);
    this._exact = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);
    this._fields = new TagSet(TagSet.FIELD_TAGS);
    this.tokeniser = _tokeniser;
    this.currentTokenStream = Tokeniser.EMPTY_STREAM;
  }


  /** Returns the underlying buffered reader, so that client code can tokenise the
    * document itself, and deal with it how it likes. */
  public Reader getReader()
  {
    return this.br;
  }


  
  protected final StringBuilder sw = new StringBuilder(tokenMaximumLength);
  protected final StringBuilder tagNameSB = new StringBuilder(10);
  
  /**
   * Returns the next token from the current chunk of text, extracted from the
   * document into a TokenStream.
   * 
   * @return String the next token of the document, or null if the 
   *     token was discarded during tokenisation.
   */
  public String getNextTerm() {


    // consumes the current token stream
    if (currentTokenStream.hasNext()) {
      return currentTokenStream.next();
    }
    
    // if the current token stream is exhausted, construct a new one
    
    //the string to return as a result at the end of this method.
    String s = null;
    //StringBuilder sw = null;
    String tagName = null;
    boolean endOfTagName;
    //are we in a body of a tag?
    boolean btag = true;
    int ch = 0;
    //while not the end of document, or the end of file, or we are in a tag
    while (btag && ch != -1 && !EOD) {
      //initialise the stringbuffer with the maximum length of a term (heuristic)
      //sw = new StringBuilder(tokenMaximumLength);
      boolean tag_close = false;
      boolean tag_open = false;
      error = false;
      try {
        if (lastChar == '<' || lastChar == '&') {
          ch = lastChar;
          lastChar = -1;
        }
        //If not EOF and ch.isNotALetter and ch.isNotADigit and
        //ch.isNot '<' and ch.isNot '&'
        //CONSUME: whitespace
        //while ((ch < 1 && ch != '<' && ch != '&') || Character.isWhitespace((char)ch))
        while (ch != -1 && (( ch != '<' && ch != '&') && Character.isWhitespace((char)ch)))
        {
          ch = br.read();
          counter++;
          //if ch is '>' (end of tag), then there is an error.
          if (ch == '>')
            error = true;
        }
        
        //IDENTIFIES: start of opening or closing tags
        if (ch == '<') {
          ch = br.read();
          counter++;
          //if it is a closing tag, set tag_f true
          if (ch == '/') {
            ch = br.read();
            counter++;
            tag_close = true;
          } else if (ch == '!') { //else if it is a comment, that is <!
            counter++;
            ch = br.read();
            if (ch == '[')
            {
              counter++;
              //CDATA block, read until another [
              while ((ch = br.read()) != '['  && ch != -1) {
                counter++;
              }
            }
            else
            {  //it is a comment  
              //read until you encounter a '<', or a '>', or the end of file
              while ((ch = br.read()) != '>' && ch != '<' && ch != -1) {
                counter++;
              } 
              counter++;
            }
          } else {
            tag_open = true; //otherwise, it is an opening tag
          }
        }
        
        if (ch == '&' ) {
          //read until an opening or the end of a tag is encountered, or the 
          //end of file, or a space, or a semicolon,
          //which means the end of the escape sequence &xxx;
          while ((ch = br.read()) != '>' && 
              ch != '<' && 
              ch != ' ' && 
              ch != ';' &&
              ch != -1) {
            counter++;
          } 
          counter++;
           
        }
        
        //if the body of a tag is encountered
        if ((btag = (tag_close || tag_open))) {
          endOfTagName = false;
          //read until the end of file, or the start, or the end 
          //of a tag, and save the content of the tag
          while (ch != -1 && ch != '<' && ch != '>') {
            if (! endOfTagName)
              tagNameSB.append((char)ch);
            ch = br.read();
            counter++;
            if (! endOfTagName && Character.isWhitespace((char)ch)) {
              endOfTagName = true;
              tagName = tagNameSB.toString();
              //System.err.println("Found tag  " + tagName + (tag_open ? "open" : "close") );
            }
          }
          //ch = br.read();counter++;
          if (! endOfTagName)
          {
            tagName = tagNameSB.toString();
            //System.err.println("Found tag " + tagName+ (tag_open ? "open" : "close"));
            tagNameSB.setLength(0);
          }
        } else { //otherwise, if we are not in the body of a tag
          //read text to tokenise
          if (((char)ch) == '>') {
            counter++;
            ch = br.read();
          }
          while (ch != -1 && ch != '<' && ch != '&')
          {
            sw.append((char)ch);
            ch = br.read();
            counter++;
          }
//          while (ch != -1
//              && (//ch=='&' || 
//                ((ch >= 'A') && (ch <= 'Z'))
//               || ((ch >= 'a') && (ch <= 'z'))
//               || ((ch >= '0') && (ch <= '9')))) {
//            sw.append((char)ch);
//            ch = br.read();
//            counter++;
//          }
        }
        lastChar = ch;
        s = sw.toString();
        sw.setLength(0);
        if (tag_open) {
          //System.err.println("processing open " + tagName);
          if ((_tags.isTagToProcess(tagName) || _tags.isTagToSkip(tagName)) && !tagName.equals("")) {
            stk.push(tagName.toUpperCase());
            if (_tags.isTagToProcess(tagName)) {
              inTagToProcess = true;
              inTagToSkip = false;
            } else {
              inTagToSkip = true;
              inTagToProcess = false;
              continue;
            }
          }
          if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
            htmlStk.add(tagName.toUpperCase());
            inHtmlTagToProcess = true;
          }
        }
        if (tag_close) {
          //System.err.println("processing close " + tagName);
          if ((_tags.isTagToProcess(tagName) || _tags.isTagToSkip(tagName)) && !tagName.equals("")) {
            processEndOfTag(tagName.toUpperCase());
            String stackTop = null;
            if (!stk.isEmpty()) {
              stackTop = stk.peek();
              if (_tags.isTagToProcess(stackTop)) {
                inTagToProcess = true;
                inTagToSkip = false;
              } else {
                inTagToProcess = false;
                inTagToSkip = true;
                continue;
              }
            } else {
              inTagToProcess = false;
              inTagToSkip = false;
            }
          }
          if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
            htmlStk.remove(tagName.toUpperCase());
          }
        }
        
      } catch (IOException ioe) {
        //logger.warn("Input/Output exception during reading tokens. Document "+ this.getProperty("docno"), ioe);
        return null;
      }
    }
    if (ch == -1) {
      processEndOfDocument();      
    }
    boolean hasWhitelist = _tags.hasWhitelist();
    if (!btag && 
        (!hasWhitelist || (hasWhitelist && inTagToProcess )) && 
        !inTagToSkip) 
    {
      if (!stk.empty() && _exact.isTagToProcess(stk.peek()))
        return lowercase ? s.toLowerCase() : s;
      //}
      saveToAbstract(s,tagName);
      currentTokenStream = tokeniser.tokenise(new StringReader(s));
      if (currentTokenStream.hasNext())
        return currentTokenStream.next();
//      if (lowercase)
//        return check(s.toLowerCase());
//      return(check(s));
    }
    return null;
  }
  
  protected void processEndOfDocument()
  {
    EOD = true;
    for(int abstractId=0;abstractId<abstractCount;abstractId++)
    {
      setProperty(abstractnames[abstractId], abstracts[abstractId].toString().trim());
    }
  }
  
  /**
   * This method takes the text parsed from a tag and then saves it to the
   * abstract(s). This method contains the logic to decide whether indeed the
   * text or some subset of it should be saved. The default behaviour checks each
   * abstract named in TaggedDocument.absracts, if for an abstract we are in the
   * correct field (specified in TaggedDocument.abstracts.tags) and then it
   * saves up to maximum character length specified in TaggedDocument.abstracts.lengths.
   * 
   * The 'ELSE' abstract tag is a special case that will be filled with any tag that 
   * is not added to an existing abstract. 
   * 
   * TaggedDocument should be sub-classed and this method overwritten if you want
   * to save abstracts in a different manner, e.g. saving the first paragraph.
   * @param text - the text to be saved
   * @param tag - the tag that this text came from
   */
  protected void saveToAbstract(String text, String tag) {
    if (tag == null) return;
    if ( ! abstractTagsCaseSensitive)
      tag = tag.toUpperCase();
    
    boolean tagFound = false;
    for (int i = 0; i<abstractCount;i++)
    {
      if ( abstracttags[i].equals(tag)) {
        tagFound = true;
        final int maxAbstractLength = abstractlengths[i];
        final int currentAbstractLength = abstracts[i].length();
        final int textLength = text.length();
        //abstract >= maxAbstractLength
        //abstract + text <= maxAbstractLength        
        //abstract + text > maxAbstractLength 
        if (currentAbstractLength<maxAbstractLength) 
        {          
          if (currentAbstractLength + textLength < maxAbstractLength)
          {
            abstracts[i].append(' ');
            abstracts[i].append(text);
          }
          else
          {
            abstracts[i].append(' ');
            abstracts[i].append(text.substring(0, maxAbstractLength - currentAbstractLength));
          }
        }
      }
    }
    if (elseAbstractSpecialTag != -1 && ! tagFound)
    {
      final int maxAbstractLength = abstractlengths[elseAbstractSpecialTag];
      final int currentAbstractLength = abstracts[elseAbstractSpecialTag].length();
      final int textLength = text.length();
      //abstract >= maxAbstractLength
      //abstract + text <= maxAbstractLength        
      //abstract + text > maxAbstractLength 
      if (currentAbstractLength<maxAbstractLength) 
      {          
        if (currentAbstractLength + textLength < maxAbstractLength)
        {
          abstracts[elseAbstractSpecialTag].append(' ');
          abstracts[elseAbstractSpecialTag].append(text);
        }
        else
        {
          abstracts[elseAbstractSpecialTag].append(' ');
          abstracts[elseAbstractSpecialTag].append(text.substring(0, maxAbstractLength - currentAbstractLength));
        }
      }
    }
  }
  
  /** 
   * Returns the fields in which the current term appears in.
   * @return HashSet a hashset containing the fields that the current
   *     term appears in.
   */
  public Set<String> getFields() {
    return htmlStk;
  }
  /**
   * Indicates whether the tokenizer has reached the end of the 
   * current document.
   * @return boolean true if the end of the current document has
   *     been reached, otherwise returns false.
   */
  public boolean endOfDocument() {
    return EOD && ! currentTokenStream.hasNext();
  }
  
  /**
   * The encountered tag, which must be a final tag is matched with the tag on
   * the stack. If they are not the same, then the consistency is restored by
   * popping the tags in the stack, the observed tag included. If the stack
   * becomes empty after that, then the end of document EOD is set to true.
   * 
   * @param tag The closing tag to be tested against the content of the stack.
   */
  protected void processEndOfTag(String tag) {
    //if there are no tags in the stack, return
    if (stk.empty())
      return;
    //if the given tag is on the top of the stack then pop it
    if (tag.equals(stk.peek()))
      stk.pop();
    else { //else report an error, and find the tag.
      int _counter = 0;
      int x = stk.search(tag);
      while (!stk.empty() & _counter < x) {
        _counter++;
        stk.pop();
      }
    }
  }
  
  /** The maximum number of digits that are allowed in valid terms. */
  protected static final int maxNumOfDigitsPerTerm = 4;
  
  /** 
   * The maximum number of consecutive same letters or digits that are 
   * allowed in valid terms.
   */
  protected static final int maxNumOfSameConseqLettersPerTerm = 3;
  
  /**
   * Checks whether a term is shorter than the maximum allowed length,
   * and whether a term does not have many numerical digits or many 
   * consecutive same digits or letters.
   * @param s String the term to check if it is valid. 
   * @return String the term if it is valid, otherwise it returns null.
   */
  public static String check(String s) {
    //if the s is null
    //or if it is longer than a specified length
    if (s == null)
      return null;
    s = s.trim();
    final int length = s.length();
    if (length == 0 || length > tokenMaximumLength)
      return null;
    
    int counter = 0;
    int counterdigit = 0;
    int ch = -1;
    int chNew = -1;
    for(int i=0;i<length;i++)
    {
      chNew = s.charAt(i);
      if (chNew >= 48 && chNew <= 57)
        counterdigit++;
      if (ch == chNew)
        counter++;
      else
        counter = 1;
      ch = chNew;
      /* if it contains more than 3 consequtive same 
       * letters (or digits), or more than 4 digits, 
       * then discard the term. 
       */
      if (counter > maxNumOfSameConseqLettersPerTerm ||
        counterdigit > maxNumOfDigitsPerTerm)
        return null;
    }
    return s;
  }


  /** Allows access to a named property of the Document. Examples might be URL, filename etc.
    * @param name Name of the property. It is suggested, but not required that this name
    * should not be case insensitive.
    * @since 1.1.0 */
  public String getProperty(String name)
  {
    return properties.get(name.toLowerCase());
  }
  
  /** Allows a named property to be added to the Document. Examples might be URL, filename etc.
    * @param name Name of the property. It is suggested, but not required that this name
    * should not be case insensitive.
    * @param value The value of the property
    * @since 1.1.0 */
  public void setProperty(String name, String value)
  {
    properties.put(name.toLowerCase(),value);
  }


    /** Returns the underlying map of all the properties defined by this Document.
    * @since 1.1.0 */  
  public Map<String,String> getAllProperties()
  {
    return properties;
  }


  /**
   * Static method which dumps a document to System.out
   * @param args A filename to parse
   */
  public static void main(String args[])
  {
    if (args.length == 0)
    {
      logger.fatal("ERROR: Please specify a test file on the command line");
      return;
    }
    Document d = generateDocumentFromFile(args[0]);
    if (d !=  null)
      dumpDocument(d);
  }


  /** instantiates a TREC document from a file */
  public static Document generateDocumentFromFile(final String filename)
  {
    BufferedReader b = null;
    try{
      b = new BufferedReader(new FileReader(filename));
    } catch (IOException ioe) {
      logger.fatal("ERROR: Problem opening TRECDocument test file : "+ ioe);
      logger.fatal("Exiting ...");
      ioe.printStackTrace();
    }
    return new TaggedDocument(b, null, new EnglishTokeniser());
  }
  
  /**
   * Dumps a document to stdout
   * @param d a Document object
   */
  public static void dumpDocument(final Document d)
  {
    int terms = 0;
    while(! d.endOfDocument() )
    {
      String t = d.getNextTerm();
      if (t == null)
        continue;
      terms++;
      System.out.print("term: "+ t);
      System.out.print("; fields = {");
      Set<String> fields = d.getFields();
      java.util.Iterator<String> f = fields.iterator();
      if (f.hasNext())
        System.out.print((f.next()));
      while(f.hasNext())
      {
        System.out.print(","+(f.next()));
      }
      System.out.println("}");
    }
    System.out.println("terms: "+terms);
  }


}
Source Code of org.terrier.indexing.TaggedDocument

Related Classes of org.terrier.indexing.TaggedDocument